Setup

library(tidyverse)
library(magrittr)
library(ngsReports)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(AnnotationHub)
library(ensembldb)
library(edgeR)
library(corrplot)
library(cqn)
library(DT)
library(Gviz)
library(AnnotationFilter)
library(Rsamtools)
library(kmer)
library(furrr)
library(pander)
library(ggrepel)
if (interactive()) setwd(here::here())
theme_set(theme_bw())

Sequence information

ah_Dr <- AnnotationHub() %>%
  subset(species == "Danio rerio") %>%
  subset(rdataclass == "EnsDb")
ensDb_Dr <- ah_Dr[["AH83189"]]
trEns_Dr <- transcripts(ensDb_Dr) %>%
  mcols() %>% 
  as_tibble()
trLen_Dr <- exonsBy(ensDb_Dr, "tx") %>%
  width() %>%
  vapply(sum, integer(1))
geneGcLen_Dr <- trLen_Dr %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Dr) %>%
  group_by(gene_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
trGcLen_Dr <- trLen_Dr %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Dr) %>%
  group_by(tx_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
genesGR_Dr <- genes(ensDb_Dr)
mcols(genesGR_Dr) <- mcols(genesGR_Dr)[c("gene_id", "gene_name", 
                                         "gene_biotype", "entrezid")]
txGR_Dr <- transcripts(ensDb_Dr)
mcols(txGR_Dr) <- mcols(txGR_Dr)[c("tx_id", "tx_name", 
                                   "tx_biotype", "tx_id_version", "gene_id")]

An EnsDb object was obtained for Ensembl release 101 using the AnnotationHub package. This provided the GC content and length for every gene and transcript in the release. For zebrafish, this consists of 37241 genes and 65905 transcripts.

Raw data

This is a total RNA dataset generated from a 3-way comparison of WT zebrafish (Danio rerio) with heterozygous mutants (psen2S4Ter/+) and homozygous mutants (psen2S4Ter/S4Ter). Previous analysis of this dataset identified the possibility of incomplete ribosomal RNA (rRNA) removal (previous analysis).This dataset was chosen for an initial inspection as it is expected that rRNA sequences in zebrafish are more divergent from more common model organisms such as mice. Hence, inefficient rRNA depletion by kits that are generally optimised for these common model organisms may produce a stronger signal in zebrafish.

Sample information

files <- list.files(
  path = "/hpcfs/users/a1647910/20200902_Psen2S4Ter/0_rawData/FastQC",
  pattern = "zip",
  full.names = TRUE
)
samples <- tibble(
  sample = str_remove(basename(files), "_fastqc.zip"),
  dataset = NA,
  organism = NA
) %>%
  mutate(
    # sample = ifelse(
    #   str_detect(sample, "Ps2Ex"), str_remove(sample, "_R1"), sample
    # ),
    dataset = ifelse(
      str_detect(sample, "Ps2Ex"), "Psen2S4Ter", dataset
    ),
    organism = ifelse(
      str_detect(sample, "(SRR213|SRR218|Ps2Ex)"), "zebrafish", organism
    )
  )
datasets <- samples$dataset %>% 
  unique()

The following analysis involves 24 samples across 1 dataset(s): Psen2S4Ter.

Library sizes

rawFqc <- files %>%
  FastqcDataList()
data <- grep("Ps2Ex", fqName(rawFqc))
labels <- rawFqc[data] %>%
  fqName() %>%
  str_remove("_6month_07_07_2016_F3") %>%
  str_remove("\\.fastq\\.gz") %>%
  str_remove("Ps2Ex3M1_")
rawLib <- plotReadTotals(rawFqc[data]) +
  labs(subtitle = "Psen2S4Ter") + 
  scale_x_discrete(labels = labels)

The library sizes of the unprocessed dataset(s) range between 27,979,654 and 37,144,975 reads.

rawLib

GC content

rRNA transcripts are known to have high GC content. Therefore, inspecting the GC content of the raw reads is a logical starting point for detecting incomplete rRNA removal. A spike in GC content at > 70% is expected if this is the case.

plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data], 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Drerio"
  ) +
    labs(title = "Psen2S4Ter Dataset (D. rerio)") + 
    theme(legend.position="none")
) 

GC content of reads in the dataset. Clear spikes above 70% GC are observed, which is likely due to incomplete rRNA depletion.

Overrepresented sequences

The top 30 overrepresented sequences were analysed using blastn and were found to be predominantly rRNA sequences.

getModule(rawFqc, "Overrep") %>% 
  group_by(Sequence, Possible_Source) %>% 
  summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>% 
  arrange(desc(`Highest Percentage`), desc(`Found In`)) %>% 
  ungroup() %>% 
  dplyr::slice(1:30) %>%
  mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
  kable(
    align = "llrr", 
    caption = paste(
      "Top", nrow(.),"Overrepresented sequences.",
      "The number of samples they were found in is shown,",
      "along with the percentage of the most 'contaminated' sample."
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Top 30 Overrepresented sequences. The number of samples they were found in is shown, along with the percentage of the most ‘contaminated’ sample.
Sequence Possible_Source Found In Highest Percentage
GTGGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGC No Hit 12 1.72%
CTGGGGGAGCGGCCGCCCCGCGGCGCCCCCTCTCGTTCCCGTCTCCGGAG No Hit 10 1.69%
CCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATCC No Hit 12 1.32%
GGCCCGGCGCACGTCCAGAGTCGCCGCCGCACACCGCAGCGCATCCCCCC No Hit 9 1.31%
CTCCTGAAAAGGTTGTATCCTTTGTTAAAGGGGCTGTACCCTCTTTAACT No Hit 11 1.11%
GGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATC No Hit 12 1.09%
GGGGTGTACGAAGCTGAACTTTTATTCATCTCCCAGACAACCAGCTATTG No Hit 12 1.07%
GGCCCGGCGCACGTCCAGAGTCGCCGCCGCGCACCGCAGCGCATCCCCCC No Hit 10 1.03%
CGAGAGGCTCTAGTTGATATACTACGGCGTAAAGGGTGGTTAAGGAACAA No Hit 12 1.01%
GGGGGAGCGGCCGCCCCGCGGCGCCCCCTCTCGTTCCCGTCTCCGGAGCG No Hit 9 0.87%
CCTCCTTCAAGTATTGTTTCATGTTACATTTTCGTATATTCTGGGGTAGA No Hit 12 0.82%
CCCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATC No Hit 12 0.79%
GTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATCT No Hit 12 0.79%
GGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCAT No Hit 12 0.78%
CGGGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG No Hit 12 0.73%
GGGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTT No Hit 11 0.68%
CAGTGGCGTGCGCCTGTAATCCAAGCTACTGGGAGGCTGAGGCTGGCGGA No Hit 11 0.64%
CGGGTCGGGTGGGTAGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG No Hit 12 0.57%
CTTAGACGACCTGGTAGTCCAAGGCTCCCCCAGGAGCACCATATCGATAC No Hit 11 0.54%
AGCTGGGGAGATCCGCGAGAAGGGCCCGGCGCACGTCCAGAGTCGCCGCC No Hit 11 0.53%
GGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTTA No Hit 10 0.53%
CAGCCTATTTAACTTAGGGCCAACCCGTCTCTGTGGCAATAGAGTGGGAA No Hit 12 0.51%
GGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGACGTGG No Hit 10 0.50%
GGGAGCGGCCGCCCCGCGGCGCCCCCTCTCGTTCCCGTCTCCGGAGCGCG No Hit 9 0.49%
CTGGGAGATGAATAAAAGTTCAGCTTCGTACACCCCAAATTAAAAAATTA No Hit 10 0.48%
GCCTATTTAACTTAGGGCCAACCCGTCTCTGTGGCAATAGAGTGGGAAGA No Hit 12 0.47%
GGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGAC No Hit 11 0.45%
CCCCCGAACCCTTCCAAGCCGAACCGGAGCCGGTCGCGGCGCACCGCCGA No Hit 10 0.45%
GTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGACG No Hit 10 0.43%
GCCCACTACGACAACGTGTTTTGTAAATTATGATCTTTATTCTCCTGAAA No Hit 10 0.43%

Trimming

trimFqc <- list.files(
  path = "/hpcfs/users/a1647910/20200902_Psen2S4Ter/1_trimmedData/FastQC",
  pattern = "zip",
  full.names = TRUE
) %>%
  FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
  dplyr::rename(Raw = Total_Sequences) %>%
  left_join(readTotals(trimFqc), by = "Filename") %>%
  dplyr::rename(Trimmed = Total_Sequences) %>%
  mutate(
    Discarded = 1 - Trimmed/Raw,
    Retained = Trimmed / Raw
  )

After trimming of adapters between 4.16% and 5.08% of reads were discarded.

Aligned data

Trimmed reads were firstly aligned to rRNA sequences using the BWA-MEM algorithm to calculate the proportion of reads that were of rRNA origin within each sample. BWA-MEM is recommended for high-quality queries of reads ranging from 70bp to 1Mbp as it is faster and more accurate that alternative algorithms BWA-backtrack and BWA-SW.

rRNA proportions

bwaMapped <- read.delim(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/3_bwa/log/allFiles.mapped", 
  sep = ":", 
  col.names = c("sample", "proportion"), 
  header = FALSE
) %>% 
  mutate(
    sample = str_remove(sample, ".mapped"),
    proportion = proportion/100,
    pair = str_remove(sample, "_R[1-2]"),
    read = ifelse(str_detect(sample, "R1"), "R1", "R2")
  )
rRnaProp <- samples %>%
  dplyr::filter(dataset == "Psen2S4Ter") %>%
  left_join(bwaMapped) %>%
  mutate(
    sample = str_remove(sample, "_6month_F3"),
    sample = str_remove(sample, "[0-9]*_Ps2Ex3M1_"),
    pair = str_remove(pair, "_6month_F3"),
    pair = str_remove(pair, "[0-9]*_Ps2Ex3M1_")
  )
rRnaProp$dataset %<>%
  factor(levels = c("Psen2S4Ter"))
rRnaProp %>%
  ggplot(aes(sample, proportion, fill = read)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~dataset, scales = "free_x") +
  scale_y_continuous(labels = percent) +
  labs(x = "Sample", y = "Percent of Total") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
*Percentages of each library that align to rRNA sequences with `bwa mem`.*

Percentages of each library that align to rRNA sequences with bwa mem.

Following alignment to known rRNA sequences, reads that mapped were removed to create a separate fastq file that did not contain sequences identified as rRNA. These remaining sequences were aligned to the Danio rerio GRCz11 genome (Ensembl release 98) using STAR v2.7.0d and summarised with featureCounts from the Subread v1.5.2 package .

Gene GC content and length

dgeList <- read_tsv("/hpcfs/users/a1647910/20200902_Psen2S4Ter/2_alignedData/featureCounts/genes.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  set_colnames(str_remove(colnames(.), "_6month_F3")) %>%
  set_colnames(str_remove(colnames(.), "[0-9]*_Ps2Ex3M1_")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
dgeList$genes <- genesGR_Dr[rownames(dgeList),]
mcols(dgeList$genes) %<>% 
  as.data.frame() %>% 
  left_join(geneGcLen_Dr)
addInfo <- tibble(
  sample = rRnaProp$pair %>%
    unique(),
  dataset = "Psen2S4Ter",
  organism = "zebrafish",
  R1_rRNA = rRnaProp %>%
    dplyr::filter(read == "R1") %>%
    .$proportion,
  R2_rRNA = rRnaProp %>%
    dplyr::filter(read == "R2") %>%
    .$proportion
) %>%
  mutate(ave_rRNA = (R1_rRNA + R2_rRNA)/2)
dgeList$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
dgeList$samples$filenames <- read_tsv(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/2_alignedData/featureCounts/genes.out"
) %>% 
  dplyr::select(str_subset(colnames(.), "Ps2Ex3M1_")) %>%
  colnames()
gcInfo <- function(x) {
  x$counts %>%
    as.data.frame() %>%
    rownames_to_column("gene_id") %>%
    as_tibble() %>%
    pivot_longer(
      cols = colnames(x), 
      names_to = "sample", 
      values_to = "counts"
    ) %>%
    dplyr::filter(
      counts > 0
    ) %>%
    left_join(
      geneGcLen_Dr
    ) %>%
    dplyr::select(
      ends_with("id"), sample, counts, aveGc, maxLen
    ) %>%
    split(f = .$sample) %>%
    lapply(
      function(x){
        DataFrame(
          gc = Rle(x$aveGc, x$counts),
          logLen = Rle(log10(x$maxLen), x$counts)
        )
      }
    ) 
}
gcSummary <- function(x) {
  x %>%
    vapply(function(x){
      c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
    }, numeric(4)
    ) %>%
    t() %>%
    set_colnames(
      c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
    ) %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble()
}
rle <- gcInfo(dgeList)
sumGc <- gcSummary(rle)
a <- sumGc %>%
  left_join(dgeList$samples) %>%
  ggplot(aes(ave_rRNA, mn_logLen)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)"
  ) 
b <- sumGc %>%
  left_join(dgeList$samples) %>%
  ggplot(aes(ave_rRNA, mn_gc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content"
  )
ggarrange(a, b, ncol = 2, nrow = 1) %>%
  annotate_figure("PsenS4Ter Dataset (D. rerio)")
*Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*

Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.

PCA

genes2keep <- dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(6)
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
pca <- cpm(dgeFilt, log = TRUE) %>%
  t() %>%
  prcomp()
pcaCor <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sumGc) %>%
  as_tibble() %>% 
  left_join(dgeList$samples) %>%
  dplyr::select(
    PC1, PC2, PC3, 
    Mean_GC = mn_gc, 
    Mean_Length = mn_logLen, 
    rRna_Proportion = ave_rRNA
  ) %>% 
  cor()
a <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(size = 2) +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = paste0("PC2 (", percent(summary(pca)$importance["Proportion of Variance","PC2"]),")")
  )
b <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(dgeList$samples) %>%
  ggplot(aes(PC1, ave_rRNA)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = "rRNA Proportion of Initial Library"
  )
c <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sumGc) %>%
  as_tibble() %>%
  ggplot(aes(PC1, mn_logLen)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = "Mean log(Length)"
  )
d <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sumGc) %>%
  as_tibble() %>%
  ggplot(aes(PC1, mn_gc)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = "Mean GC"
  )
ggarrange(a, b, c, d, ncol = 2, nrow = 2) %>%
  annotate_figure("Psen2S4Ter")
*PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.*

PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.

corrplot(
  pcaCor,
  type = "lower", 
  diag = FALSE, 
  addCoef.col = 1, addCoefasPercent = TRUE
)
*Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.*

Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.

Differential expression

design <- model.matrix(~ave_rRNA, data = dgeFilt$samples)
voom <- voom(dgeFilt, design = design)
fit <- lmFit(voom, design = design)
eBayes <- eBayes(fit)
topTable <- eBayes %>%
  topTable(coef = colnames(design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  unite(Location, c(seqnames, start, end, width, strand), sep = ":") %>%
  dplyr::select(
    Geneid = gene_id,
    Symbol = gene_name,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    Location,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
topTable %>% 
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR, DE) %>%
  mutate(
    AveExpr = format(round(AveExpr, 2), nsmall = 2),
    logFC = format(round(logFC, 2), nsmall = 2),
    P.Value = sprintf("%.2e", P.Value),
    FDR = sprintf("%.2e", FDR)
  ) %>%
  dplyr::slice(1:200) %>%
  datatable(options = list(pageLength = 20), class = "striped hover condensed responsive", filter = "top")

k-mer analysis

k = 5

Counts

k5files <- list.files("/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k5", pattern = "_dumps.txt", full.names = TRUE)
k5counts <- lapply(k5files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))

DGE List

k5dge <- k5counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k5dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k5dge$samples$group <- colnames(k5dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k5dge %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = TRUE, main = "Distribution of 5-mers")

k5dge$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library size") +
  scale_fill_discrete(
    name = "Genotype"
  )

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k5pca <- k5dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k5pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 3.74 1.622 0.6056 0.2888 0.2167 0.1588 0.1347 1.25e-14
Proportion of Variance 0.8152 0.1533 0.02138 0.00486 0.00274 0.00147 0.00106 0
Cumulative Proportion 0.8152 0.9685 0.9899 0.9947 0.9975 0.9989 1 1
# Plot PCA
k5pcaPlot <- k5pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k5dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k5pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 5"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k5design <- model.matrix(~ave_rRNA, data = k5dge$samples)
k5voom <- voom(k5dge, design = k5design)
k5fit <- lmFit(k5voom, design = k5design)
k5eBayes <- eBayes(k5fit)
k5topTable <- k5eBayes %>%
  topTable(coef = colnames(k5design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 6

Counts

k6files <- list.files("/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k6", pattern = "_dumps.txt", full.names = TRUE)
k6counts <- lapply(k6files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))

DGE List

k6dge <- k6counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k6dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k6dge$samples$group <- colnames(k6dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k6dge %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = TRUE, main = "Distribution of 6-mers")

k6dge$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library size") +
  scale_fill_discrete(
    name = "Genotype"
  )

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k6pca <- k6dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k6pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 9.688 3.969 1.922 1.003 0.6683 0.5474 0.4628 2.327e-14
Proportion of Variance 0.8143 0.1366 0.03205 0.00873 0.00387 0.0026 0.00186 0
Cumulative Proportion 0.8143 0.9509 0.9829 0.9917 0.9955 0.9981 1 1
# Plot PCA
k6pcaPlot <- k6pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k6dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k6pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 6"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k6design <- model.matrix(~ave_rRNA, data = k6dge$samples)
k6voom <- voom(k6dge, design = k6design)
k6fit <- lmFit(k6voom, design = k6design)
k6eBayes <- eBayes(k6fit)
k6topTable <- k6eBayes %>%
  topTable(coef = colnames(k6design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 7

Counts

k7files <- list.files("/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k7", pattern = "_dumps.txt", full.names = TRUE)
k7counts <- lapply(k7files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))

DGE List

k7dge <- k7counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k7dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k7dge$samples$group <- colnames(k7dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k7dge %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = TRUE, main = "Distribution of 7-mers")

k7dge$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library size") +
  scale_fill_discrete(
    name = "Genotype"
  )

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k7pca <- k7dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k7pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 23.71 8.571 4.459 2.676 2.021 1.546 1.259 2.899e-14
Proportion of Variance 0.8381 0.1096 0.02966 0.01068 0.00609 0.00356 0.00236 0
Cumulative Proportion 0.8381 0.9476 0.9773 0.988 0.9941 0.9976 1 1
# Plot PCA
k7pcaPlot <- k7pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k7dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k7pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 7"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k7design <- model.matrix(~ave_rRNA, data = k7dge$samples)
k7voom <- voom(k7dge, design = k7design)
k7fit <- lmFit(k7voom, design = k7design)
k7eBayes <- eBayes(k7fit)
k7topTable <- k7eBayes %>%
  topTable(coef = colnames(k7design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 8

Counts

k8files <- list.files("/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k8", pattern = "_dumps.txt", full.names = TRUE)
k8counts <- lapply(k8files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))

DGE List

k8dge <- k8counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k8dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k8dge$samples$group <- colnames(k8dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k8dge %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = TRUE, main = "Distribution of 8-mers")

k8dge$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library size") +
  scale_fill_discrete(
    name = "Genotype"
  )

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k8pca <- k8dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k8pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 45.19 15.73 8.783 6.407 5.455 3.795 2.882 8.602e-14
Proportion of Variance 0.8301 0.1005 0.03136 0.01669 0.0121 0.00586 0.00338 0
Cumulative Proportion 0.8301 0.9306 0.962 0.9787 0.9908 0.9966 1 1
# Plot PCA
k8pcaPlot <- k8pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k8dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k8pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 8"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k8design <- model.matrix(~ave_rRNA, data = k8dge$samples)
k8voom <- voom(k8dge, design = k8design)
k8fit <- lmFit(k8voom, design = k8design)
k8eBayes <- eBayes(k8fit)
k8topTable <- k8eBayes %>%
  topTable(coef = colnames(k8design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 9

Counts

k9files <- list.files("/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k9", pattern = "_dumps.txt", full.names = TRUE)
k9counts <- lapply(k9files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))

DGE List

k9dge <- k9counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k9dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k9dge$samples$group <- colnames(k9dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k9dge %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = TRUE, main = "Distribution of 9-mers")

k9dge$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library size") +
  scale_fill_discrete(
    name = "Genotype"
  )

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k9pca <- k9dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k9pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 75.39 29.61 16.6 13.77 11.93 8.868 6.986 3.232e-13
Proportion of Variance 0.7791 0.1202 0.03779 0.02598 0.01952 0.01078 0.00669 0
Cumulative Proportion 0.7791 0.8992 0.937 0.963 0.9825 0.9933 1 1
# Plot PCA
k9pcaPlot <- k9pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k9dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k9pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 9"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k9design <- model.matrix(~ave_rRNA, data = k9dge$samples)
k9voom <- voom(k9dge, design = k9design)
k9fit <- lmFit(k9voom, design = k9design)
k9eBayes <- eBayes(k9fit)
k9topTable <- k9eBayes %>%
  topTable(coef = colnames(k9design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 10

Counts

k10files <- list.files("/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k10", pattern = "_dumps.txt", full.names = TRUE)
k10counts <- lapply(k10files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))

DGE List

k10dge <- k10counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k10dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k10dge$samples$group <- colnames(k10dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k10dge %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = TRUE, main = "Distribution of 10-mers")

k10dge$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library size") +
  scale_fill_discrete(
    name = "Genotype"
  )

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k10pca <- k10dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k10pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 134.4 59.52 32.51 30.39 27.96 21.64 19.12 4.011e-13
Proportion of Variance 0.7167 0.1406 0.04194 0.03667 0.03104 0.01858 0.01451 0
Cumulative Proportion 0.7167 0.8573 0.8992 0.9359 0.9669 0.9855 1 1
# Plot PCA
k10pcaPlot <- k10pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k10dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k10pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 10"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k10design <- model.matrix(~ave_rRNA, data = k10dge$samples)
k10voom <- voom(k10dge, design = k10design)
k10fit <- lmFit(k10voom, design = k10design)
k10eBayes <- eBayes(k10fit)
k10topTable <- k10eBayes %>%
  topTable(coef = colnames(k10design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

Results comparison

PCA

ggarrange(
  k5pcaPlot, k6pcaPlot, k7pcaPlot, k8pcaPlot, k9pcaPlot, k10pcaPlot,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)

Differential expression

topRes <- function(x, cap){
  x %>% 
    dplyr::select(mer, AveExpr, logFC, P.Value, FDR, DE) %>%
    mutate(
      AveExpr = format(round(AveExpr, 2), nsmall = 2),
      logFC = format(round(logFC, 2), nsmall = 2),
      P.Value = sprintf("%.2e", P.Value),
      FDR = sprintf("%.2e", FDR)
    ) %>%
    dplyr::slice(1:200) %>%
    datatable(
      options = list(pageLength = 20),
      class = "striped hover condensed responsive",
      filter = "top",
      caption = cap
    )
}
topRes(k5topTable,
       cap = paste(
         "The top 200 differentially expressed 5-mers.",
         nrow(dplyr::filter(k5topTable, DE)),
         "of",
         nrow(k5topTable),
         "detected sequences were classified as DE with an FDR < 0.05."
       )
)
topRes(k6topTable,
       cap = paste(
         "The top 200 differentially expressed 6-mers.",
         nrow(dplyr::filter(k6topTable, DE)),
         "of",
         nrow(k6topTable),
         "detected sequences were classified as DE with an FDR < 0.05."
       )
)
topRes(k7topTable,
       cap = paste(
         "The top 200 differentially expressed 7-mers.",
         nrow(dplyr::filter(k7topTable, DE)),
         "of",
         nrow(k7topTable),
         "detected sequences were classified as DE with an FDR < 0.05."
       )
)
topRes(k8topTable,
       cap = paste(
         "The top 200 differentially expressed 8-mers.",
         nrow(dplyr::filter(k8topTable, DE)),
         "of",
         nrow(k8topTable),
         "detected sequences were classified as DE with an FDR < 0.05."
       )
)
topRes(k9topTable,
       cap = paste(
         "The top 200 differentially expressed 9-mers.",
         nrow(dplyr::filter(k9topTable, DE)),
         "of",
         nrow(k9topTable),
         "detected sequences were classified as DE with an FDR < 0.05."
       )
)
topRes(k10topTable,
       cap = paste(
         "The top 200 differentially expressed 10-mers.",
         nrow(dplyr::filter(k10topTable, DE)),
         "of",
         nrow(k10topTable),
         "detected sequences were classified as DE with an FDR < 0.05."
       )
)